資料來源:
對資料表進行初步檢視後,進行整理,以便後續進行資料視覺化。
import packages
library(htmlwidgets)
library(webshot)
library(dplyr)
library(zoo)
library(pracma)
library(ggplot2)
library(plotly)
library(dygraphs)
library(xts)
library(caret)
library(tidyr)
# library(ggpmisc)定義後續使用之函數
# 計算標準化數據與其七日簡單平均、七日指數平均、變動率
getMA_ROC <- function(df, prefix){
df$Date <- as.Date(df$Date)
df[prefix] <- as.numeric(unlist(df[prefix]))
preproc1 <- preProcess(df[prefix], method=c("center", "scale"))
df['norm'] <- predict(preproc1, df[prefix])
df['ROC'] <- (df[prefix] - lag(df[prefix]))/(lag(df[prefix])+1)
df['7MA'] <- movavg(unlist(df['norm']),7, type="s")
df['7EMA'] <- movavg(unlist(df['norm']),7, type="e")
colnames(df) <- c("Date", prefix, paste0(prefix, c("_std","_ROC", "_7MA", "_7EMA")))
return(df)
}
# 計算台灣各類股繪圖資料
tw_cat_rmse <- function(df){
na = colnames(df[2])
df['real'] <- df[5]
df['pred'] <- df[3]
df['origin'] <- df[4]
df['residues'] = df['pred'] - df['real']
df['group'] = c(rep(seq(as.Date("2021-04-03"), as.Date("2021-05-28"), 5), each = 5), rep(as.Date("2021-06-02"),3)) # 補最後三個
df = df %>% group_by(group) %>%
summarise(rmse = sqrt(mean(sum(residues**2))), cases = sum(origin), sign = mean(sum(residues)), category = na)
return(df)
}# read data
list.files("../rlads_Final_Project/data/stock")## [1] "BTC-Oil-Gold-COVID.csv" "SP500.csv" "SP500_USnewcases.csv"
## [4] "TAIEX-COVID.csv" "TAIEX.csv" "TAIEX_7MA.csv"
## [7] "TAIEX_ROC.csv"
# [1] "BTC-Oil-Gold-COVID.csv" "SP500.csv" "SP500_USnewcases.csv" "TAIEX-COVID.csv" "TAIEX.csv" "TAIEX_7MA.csv" "TAIEX_ROC.csv" btc_oil_gold = read.csv("../rlads_Final_Project/data/stock/BTC-Oil-Gold-COVID.csv", encoding = "UTF-8") # 比特幣-原油-黃金價格,七日簡單均線,變動率
US_SP500_ALL = read.csv("../rlads_Final_Project/data/stock/SP500.csv", encoding = "UTF-8") # 美股標普五百指數原始數值
US_SP500_CAT = read.csv("../rlads_Final_Project/data/stock/SP500_USnewcases.csv", encoding = "UTF-8") # 美股標普五百各類原始數值
TAIEX_COVID = read.csv("../rlads_Final_Project/data/stock/TAIEX-COVID.csv", encoding = "UTF-8") # 台股大盤,七日簡單均線,變動率
TAIEX = read.csv("../rlads_Final_Project/data/stock/TAIEX.csv", encoding = "UTF-8") # 台股類股原始數值
TAIEX_7MA = read.csv("../rlads_Final_Project/data/stock/TAIEX_7MA.csv", encoding = "UTF-8") # 台股類股七日簡單均線
TAIEX_ROC = read.csv("../rlads_Final_Project/data/stock/TAIEX_ROC.csv", encoding = "UTF-8") # 台股類股變動率
# 整合至資料列表
data_list = list(btc_oil_gold, US_SP500_ALL, US_SP500_CAT, TAIEX_COVID, TAIEX, TAIEX_7MA, TAIEX_ROC)
names(data_list) = gsub("-", "_",list.files('data\\stock'))# get column names
col_names = lapply(data_list, colnames)# rename data frame columns
col_names[[7]] = paste0(col_names[[7]], "_ROC")
col_names[[6]] = paste0(col_names[[6]], "_7MA")
col_names[[4]] = c("Date", "大盤", "大盤_ROC", "大盤_7MA", "大盤_7EMA", "TW_COVID", "TW_COVID_ROC" ,"TW_COVID_7MA", "TW_COVID_7EMA", "TW_COVID_CUM")
col_names[[1]][1] = "Date"
# rename data frame columns
for (i in 1:length(data_list)){
names(data_list[[i]]) <- col_names[[i]]
}重新分類data frame,原本的data frame含有重複的欄位(例如疫情資料),僅保留所需欄位後,分類各類別資料並獨立各自成表。
TW_COVID = data_list[[4]][c("Date", "TW_COVID","TW_COVID_ROC" ,"TW_COVID_7MA", "TW_COVID_7EMA", "TW_COVID_CUM" )]
US_COVID = data_list[[1]][c("Date","new_cases","new_cases_ROC","new_cases_7MA", "total_cases")]
colnames(US_COVID) <- c("Date", "US_COVID","US_COVID_ROC" ,"US_COVID_7MA", "US_COVID_CUM")
BTC = data_list[[1]][c("Date","BTC","BTC_7MA", "BTC_ROC")]
OIL = data_list[[1]][c("Date","Oil","Oil_7MA", "Oil_ROC")]
GOLD = data_list[[1]][c("Date","Gold","Gold_7MA", "Gold_ROC")]
TAIEX_ALL = data_list[[4]][c("Date", "大盤", "大盤_ROC", "大盤_7MA", "大盤_7EMA" )]
TAIEX_CAT = list(data_list[[5]][,2:34], data_list[[6]][,2:34], data_list[[7]][,2:34]) # 只保留類股數據與日期
US_SP500_CAT[c("X", "NewCases")] <- NULL # 要另外計算7MA, ROC
# US_SP500_ALL, US_SP500_CAT要另外計算7MA, ROC# 整理後新的data_list
data_list = list(TW_COVID, US_COVID, BTC, OIL, GOLD, TAIEX_ALL, US_SP500_ALL)
head(data_list[[1]])## Date TW_COVID TW_COVID_ROC TW_COVID_7MA TW_COVID_7EMA TW_COVID_CUM
## 1 2021/4/1 6 0.0000000 3.428571 3.428571 1036
## 2 2021/4/2 3 -0.5596158 3.714286 3.321429 1039
## 3 2021/4/3 6 0.5596158 3.571429 3.991071 1045
## 4 2021/4/4 2 -0.8472979 3.571429 3.493304 1047
## 5 2021/4/5 1 -0.4054651 3.571429 2.869978 1048
## 6 2021/4/6 2 0.4054651 3.714286 2.652483 1050
col <- c("TW_COVID", "US_COVID", "BTC", "Oil", "Gold", "大盤", "SP500")
for (i in 1:length(data_list)){
pref <- col[i]
dat <- data_list[[i]]
data_list[[i]] <- getMA_ROC(dat[c("Date",pref)], prefix = pref)
}
head(data_list[[1]])## Date TW_COVID TW_COVID_std TW_COVID_ROC TW_COVID_7MA TW_COVID_7EMA
## 1 2021-04-01 6 -0.5921804 NA -0.5921804 -0.5921804
## 2 2021-04-02 3 -0.6062040 -0.4285714 -0.5991922 -0.5956863
## 3 2021-04-03 6 -0.5921804 0.7500000 -0.5968549 -0.5948098
## 4 2021-04-04 2 -0.6108785 -0.5714286 -0.6003608 -0.5988270
## 5 2021-04-05 1 -0.6155530 -0.3333333 -0.6033993 -0.6030085
## 6 2021-04-06 2 -0.6108785 0.5000000 -0.6046458 -0.6049760
台灣類股與美國類股資料數眾多,獨立處理。
# 台灣類股
TAIEX_CAT_list = list()
for (i in 2:length(TAIEX_CAT[[1]])){
dat <- TAIEX_CAT[[1]][c(1,i)]
pref <- colnames(dat)[2]
TAIEX_CAT_list[[i-1]] <- getMA_ROC(dat[c("Date",pref)], prefix = pref)
}
head(TAIEX_CAT_list[[1]])## Date 水泥 水泥_std 水泥_ROC 水泥_7MA 水泥_7EMA
## 1 2021-04-01 185.38 -1.505895 NA -1.505895 -1.505895
## 2 2021-04-02 185.46 -1.496297 0.0004292306 -1.501096 -1.503495
## 3 2021-04-03 185.54 -1.486699 0.0004290464 -1.496297 -1.499296
## 4 2021-04-04 185.62 -1.477100 0.0004288624 -1.491498 -1.493747
## 5 2021-04-05 185.70 -1.467502 0.0004286786 -1.486699 -1.487186
## 6 2021-04-06 185.78 -1.457904 0.0004284949 -1.481899 -1.479865
# 美國類股
US_SP500_CAT_list = list()
for (i in 2:length(US_SP500_CAT)){
dat <- US_SP500_CAT[c(1,i)]
pref <- colnames(dat)[2]
US_SP500_CAT_list[[i-1]] <- getMA_ROC(dat[c("Date",pref)], prefix = pref)
}
head(US_SP500_CAT_list[[1]])## Date Energy Energy_std Energy_ROC Energy_7MA Energy_7EMA
## 1 2020-01-01 460.17 2.409713 NA 2.409713 2.409713
## 2 2020-01-02 460.34 2.412384 0.0003686276 2.411048 2.410381
## 3 2020-01-03 458.77 2.387721 -0.0034031300 2.403272 2.404716
## 4 2020-01-04 459.96 2.406414 0.0025882506 2.404058 2.405140
## 5 2020-01-05 461.15 2.425108 0.0025815689 2.408268 2.410132
## 6 2020-01-06 462.34 2.443802 0.0025749216 2.414190 2.418550
這邊我們先採用初步分別作出各指數原始值與新增確診人數原始值及各指數變動率對新增確診人數變動率兩種動圖去探索其中的關聯性。
指數包含美國SP500指數、台股大盤指數、比特幣、石油、黃金五種標的。
temp_data = full_join(data_list[[7]], data_list[[2]], by = "Date")[c("Date","SP500_std","SP500_7MA", "SP500_7EMA","SP500_ROC","US_COVID_std")]
p <- temp_data %>%
ggplot( aes(US_COVID_std, SP500_7MA, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "美國SP500指數-COVID-19確診人數散佈圖/趨勢圖",x='單日確診人數(標準化)',y='標準化後指數之七日簡單移動平均') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")從此圖可以看出美國的股市只有在疫情初期受到劇烈的影響,疫情中期之後無論確診病例數為何,基本上標普500指數都是穩定上升,其中一個因素在於美國(甚至是全球各國)持續進行「量化寬鬆」的貨幣政策,針對特定對象進行「紓困」,並透過印鈔使市場上流動的資金增加,從而達到短期解決市場信心與消費不振的問題(參考資料)。
temp_data = full_join(data_list[[7]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_ROC, SP500_ROC, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "美國SP500指數變動率-COVID-19新增確診人數變動率散佈圖/趨勢圖",x='新增確診人數',y='指數變動率') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")temp_data = full_join(data_list[[6]], data_list[[1]], by = "Date") # [c("Date","大盤_std","大盤_7MA", "大盤_7EMA","大盤_ROC","TW_COVID_std")]
p <- temp_data %>%
ggplot( aes(TW_COVID_std, 大盤_7MA, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "台灣大盤指數-COVID-19確診人數散佈圖/趨勢圖", x='單日確診人數(標準化)', y='標準化後指數之七日簡單移動平均') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\TAIEX_7MA對TW_COVID_std.html")將大盤指數乘以100使圖表比較更直觀
temp_data = full_join(data_list[[6]], data_list[[1]], by = "Date") # [c("Date","大盤_std","大盤_7MA", "大盤_7EMA","大盤_ROC","TW_COVID_std")]
p <- temp_data %>%
ggplot( aes(TW_COVID_ROC, 大盤_ROC*100, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "台灣大盤指數變動率-COVID-19新增確診人數變動率散佈圖/趨勢圖", x='新增確診人數變動率', y='指數變動率') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\TAIEX_7MA對TW_COVID_std.html")temp_data = full_join(data_list[[3]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_std, BTC_7MA, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "比特幣-COVID-19確診人數散佈圖/趨勢圖",x='單日確診人數(標準化)',y='標準化後指數之七日簡單移動平均') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")temp_data = full_join(data_list[[3]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_ROC, BTC_ROC, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "比特幣變動率-COVID-19確診人數變動率散佈圖/趨勢圖",x='新增確診人數變動率',y='比特幣變動率') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")temp_data = full_join(data_list[[4]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_std, Oil_7MA, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "石油-COVID-19確診人數散佈圖/趨勢圖",x='單日確診人數(標準化)',y='標準化後指數之七日簡單移動平均') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")temp_data = full_join(data_list[[4]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_ROC, Oil_ROC, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "石油變動率-COVID-19確診人數散佈圖/趨勢圖",x='新增確診人數變動率',y='石油變動率') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")temp_data = full_join(data_list[[5]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_std, Gold_7MA, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "黃金-COVID-19確診人數散佈圖/趨勢圖",x='單日確診人數(標準化)',y='標準化後指數之七日簡單移動平均') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")temp_data = full_join(data_list[[5]], data_list[[2]], by = "Date")
p <- temp_data %>%
ggplot( aes(US_COVID_ROC, Gold_ROC, color=Date)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "黃金變動率-COVID-19確診人數散佈圖/趨勢圖",x='新增確診人數變動率',y='黃金變動率') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\SP500_7MA對US_COVID_std.html")初步看完以上資料的簡單關聯後,我們想利用rmse看不同時期病例數對指數的波動影響。
# 抓出繪圖所需資料
us_case = full_join(data_list[[2]][c("Date","US_COVID","US_COVID_std")], data_list[[7]][c("Date","SP500_std")],by = 'Date')
# 計算殘差
us_case['residues'] = us_case['SP500_std'] - us_case['US_COVID_std']
# 處理時間序列(以一個月為單位)
dates <- c("2020/01", "2020/02", "2020/03", "2020/04", "2020/05", "2020/06",
"2020/07", "2020/08", "2020/09", "2020/10", "2020/11", "2020/12",
"2021/01", "2021/02", "2021/03", "2021/04")
us_case['group'] = format(us_case['Date'], '%Y-%m')
# 每一個月加總一次算rmse
US_RMSE = us_case %>% group_by(group) %>%
summarise(rmse = sqrt(mean(sum(residues**2))), cases = sum(US_COVID), sign = mean(sum(residues)))
US_RMSE['group'] <- as.Date(as.yearmon(unlist(US_RMSE['group'])) )+14
# 繪圖
p <- ggplot( )+
geom_bar(data = US_RMSE, aes(x = group, y = cases/100000), stat = "identity") +
geom_point(data = US_RMSE, aes(x = group, y = rmse*5, colour = "rmse(/5)")) +
geom_line(data = US_RMSE, aes(x = group, y = rmse*5, colour = "rmse(/5)")) +
geom_line(data = US_RMSE, aes(x = group, y = sign, colour = "sign")) +
labs(title = "US_RMSE(monthly)", x ="日期", y = "一個月內累積確診人數 (*10^5)",color = 'lines') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\US_RMSE(monthly).html")若單純觀察美國每日新增確診人數與美國SP500指數的RMSE,可以發現主要的波動有兩次,第一次落在2020年3月至4月,而第二次則是落在2020年12月至2021年1月。第一次的RMSE波動主要就是受到新冠肺炎疫情第一次升溫的影響(佐證資料)。第二次的RMSE波動則與第二次疫情升溫沒有太大影響,反而是因為美國2020年12月開始實施一系列疫情的應對措施,使得投資這對未來疫情的控制充滿了希望(佐證資料)。
dfRMSE = read.csv("RMSE.csv", encoding ="UTF-8")
dfRMSE['Date']<- as.Date(as.yearmon(unlist(dfRMSE['Date']), "%Y/%m"))+14
casesCol = data_list[[2]][c("Date", "US_COVID") ]
casesCol['Date']<- as.Date(unlist(casesCol['Date']))
casesCol['Date'] = format(casesCol['Date'], "%Y-%m")
casesCol['US_COVID'] <- as.numeric(unlist(casesCol['US_COVID']))
casesCol<- casesCol %>%
group_by(Date) %>%
summarise(sum = sum(US_COVID ))
casesCol['Date'] <- as.Date(as.yearmon(unlist(casesCol['Date']), "%Y-%m"))+14
p <- ggplot() +
geom_bar(data = casesCol, aes(x = Date, y = sum*100/0.3), stat = "identity") +
geom_point(data = dfRMSE, aes(x = Date, y = RMSE, group = Category, color = Category)) +
geom_line(data = dfRMSE, aes(x = Date, y = RMSE, group = Category, color = Category)) +
# 副座標軸:Cases (調整刻度)
scale_y_continuous(sec.axis = sec_axis(~. *0.3/100, name = "Cases")) +
labs(title = "11 Categories' RMSE in S&P500") +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\11_Categories_RMSE_SP500.html")該圖表將美國SP500的11個產業類股進行分別觀察,可以看出2020年11月以前的每日確診數與各類股指數波動都有一定的相關性,12月到1月這段期間的RMSE明顯提高,顯示出在這段期間類股指數的變動率遠大於確診數。也就是說美國新增確診數從1月逐漸下降,但是下降幅度不及各大類股上升的幅度。以下取出在2020年12月至2021年1月之間,RMSE波動最大的兩個產業類股與波動最小的兩個類股進行分析:
# btc, oil, gold
BTC_OIL_GOLD_PLOT <- list()
BTC_OIL_GOLD_PLOT[[1]] <- data_list[[3]]
BTC_OIL_GOLD_PLOT[[2]] <- data_list[[4]]
BTC_OIL_GOLD_PLOT[[3]] <- data_list[[5]]
BTC_OIL_GOLD_PLOT[[1]]['group'] = 'BTC'
BTC_OIL_GOLD_PLOT[[2]]['group'] = 'OIL'
BTC_OIL_GOLD_PLOT[[3]]['group'] = 'GOLD'
names(BTC_OIL_GOLD_PLOT[[1]]) <- c('Date','origin','std','ROC', '7MA', '7EMA', 'group')
names(BTC_OIL_GOLD_PLOT[[2]]) <- c('Date','origin','std','ROC', '7MA', '7EMA', 'group')
names(BTC_OIL_GOLD_PLOT[[3]]) <- c('Date','origin','std','ROC', '7MA', '7EMA', 'group')
# 抓出繪圖所需資料
temp_data = rbind(BTC_OIL_GOLD_PLOT[[1]],BTC_OIL_GOLD_PLOT[[2]], BTC_OIL_GOLD_PLOT[[3]])
BOG_case = full_join(data_list[[2]][c("Date","US_COVID","US_COVID_std")], temp_data, by = 'Date')
# 計算殘差
BOG_case['residues'] = BOG_case['std'] - BOG_case['US_COVID_std']
# 處理時間序列(以一個月為單位)
BOG_case['month'] = format(BOG_case['Date'], '%Y-%m')
# 計算總表
BOG_RMSE = BOG_case %>% group_by(group, month) %>%
summarise(rmse = sqrt(mean(sum(residues**2))), cases = sum(US_COVID), sign = mean(sum(residues)))
BOG_RMSE['month'] <- as.Date(as.yearmon(unlist(BOG_RMSE['month'])) )+14
# 繪圖
p <- ggplot( )+
geom_bar(data = BOG_RMSE, aes(x = month, y = cases/100000), stat = "identity") +
geom_point(data = BOG_RMSE, aes(x = month, y = rmse*7, group = group, colour = group)) +
geom_line(data = BOG_RMSE, aes(x = month, y = rmse*7, group = group, colour = group)) +
geom_line(linetype="dashed",data = BOG_RMSE, aes(x = month, y = sign, group = group, colour = group)) +
labs(title = "BTC_OIL_GOLD_RMSE(monthly)", x ="日期", y = "一個月內累積確診人數 (*10^5)",color = 'lines') +
theme(text=element_text(size=25, family="A"))+
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\BTC_OIL_GOLD_RMSE(monthly).html")# 每五天加總一次算rmse
tw_case = full_join(data_list[[1]][c("Date","TW_COVID","TW_COVID_std")], data_list[[6]][c("Date","大盤_std")],by = 'Date')
# 計算殘差
tw_case['residues'] = tw_case['大盤_std'] - tw_case['TW_COVID_std']
# 每五天分為一類進行RMSE計算
tw_case['group'] = c(rep(seq(as.Date("2021-04-03"), as.Date("2021-05-28"), 5), each = 5), rep(as.Date("2021-06-02"),3)) # 補最後三個
TW_RMSE = tw_case %>% group_by(group) %>%
summarise(rmse = sqrt(mean(sum(residues**2))), cases = sum(TW_COVID), sign = mean(sum(residues)))
# 繪圖
p <- ggplot() +
geom_bar(data = TW_RMSE, aes(x = group, y = cases*0.4/100), stat = "identity") +
geom_point(data = TW_RMSE, aes(x = group, y = rmse, colour = "rmse")) +
geom_line(data = TW_RMSE, aes(x = group, y = rmse, color = "rmse")) +
geom_line(linetype="dashed",data = TW_RMSE, aes(x = group, y = sign, color = "sign")) +
labs(title = "TAIEX_RMSE(5 DAYS)", x ="日期", y = "五天內累積確診人數 (*100/0.4)",color = 'lines') +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\TAIEX_RMSE(5 DAYS).html")台灣類股資料處理
TAIEX_CAT_plot = data.frame()
TAIEX_CAT_w_COVID = list()
for (i in 1:length(TAIEX_CAT_list)){
tw_case = full_join(TAIEX_CAT_list[[i]][c(1:3)],data_list[[1]][c("Date","TW_COVID","TW_COVID_std")], by = 'Date')
TAIEX_CAT_plot = rbind(TAIEX_CAT_plot,data.frame(tw_cat_rmse(tw_case)))
TAIEX_CAT_w_COVID[[i]] <- data.frame(tw_cat_rmse(tw_case))
}
head(TAIEX_CAT_plot)## group rmse cases sign category
## 1 2021-04-03 1.975718 18 -4.416496 水泥
## 2 2021-04-08 1.940253 8 -4.335758 水泥
## 3 2021-04-13 1.378141 12 -2.845938 水泥
## 4 2021-04-18 2.106813 10 4.273679 水泥
## 5 2021-04-23 4.851571 22 10.656778 水泥
## 6 2021-04-28 3.701146 28 8.206385 水泥
head(TAIEX_CAT_w_COVID)## [[1]]
## group rmse cases sign category
## 1 2021-04-03 1.975718 18 -4.416496 水泥
## 2 2021-04-08 1.940253 8 -4.335758 水泥
## 3 2021-04-13 1.378141 12 -2.845938 水泥
## 4 2021-04-18 2.106813 10 4.273679 水泥
## 5 2021-04-23 4.851571 22 10.656778 水泥
## 6 2021-04-28 3.701146 28 8.206385 水泥
## 7 2021-05-03 2.227369 32 4.801921 水泥
## 8 2021-05-08 3.687012 39 8.226952 水泥
## 9 2021-05-13 1.483121 276 0.343959 水泥
## 10 2021-05-18 2.670477 1350 -5.309960 水泥
## 11 2021-05-23 3.766543 2631 -7.785081 水泥
## 12 2021-05-28 4.135384 2704 -8.575037 水泥
## 13 2021-06-02 NA 1229 NA 水泥
##
## [[2]]
## group rmse cases sign category
## 1 2021-04-03 1.1039546 18 -2.4361176 食品
## 2 2021-04-08 0.5875681 8 -1.2647605 食品
## 3 2021-04-13 0.4487647 12 -0.8544397 食品
## 4 2021-04-18 2.8072908 10 5.7521719 食品
## 5 2021-04-23 3.7269371 22 8.3082600 食品
## 6 2021-04-28 4.8757079 28 10.5950537 食品
## 7 2021-05-03 3.8612663 32 8.0279943 食品
## 8 2021-05-08 3.6644260 39 8.0605632 食品
## 9 2021-05-13 2.1147026 276 -1.2098553 食品
## 10 2021-05-18 3.4676768 1350 -7.2524550 食品
## 11 2021-05-23 5.2021935 2631 -10.9593844 食品
## 12 2021-05-28 5.9686786 2704 -12.7463941 食品
## 13 2021-06-02 NA 1229 NA 食品
##
## [[3]]
## group rmse cases sign category
## 1 2021-04-03 0.6595552 18 -1.4539724 塑膠
## 2 2021-04-08 0.5321225 8 -0.1471981 塑膠
## 3 2021-04-13 1.6949282 12 3.4645667 塑膠
## 4 2021-04-18 4.4631510 10 9.9216069 塑膠
## 5 2021-04-23 3.8687476 22 8.6442750 塑膠
## 6 2021-04-28 3.7724747 28 8.3975065 塑膠
## 7 2021-05-03 3.2099512 32 7.1402532 塑膠
## 8 2021-05-08 3.2747842 39 7.1752434 塑膠
## 9 2021-05-13 2.4509069 276 -3.7500848 塑膠
## 10 2021-05-18 4.6418145 1350 -10.1245281 塑膠
## 11 2021-05-23 6.1664395 2631 -13.3664811 塑膠
## 12 2021-05-28 5.4896844 2704 -11.8263366 塑膠
## 13 2021-06-02 NA 1229 NA 塑膠
##
## [[4]]
## group rmse cases sign category
## 1 2021-04-03 2.6092270 18 -5.828981 紡織
## 2 2021-04-08 1.7831308 8 -3.788187 紡織
## 3 2021-04-13 0.8597888 12 -1.837450 紡織
## 4 2021-04-18 1.4074246 10 2.890790 紡織
## 5 2021-04-23 1.8676038 22 4.120869 紡織
## 6 2021-04-28 2.3561408 28 5.185687 紡織
## 7 2021-05-03 2.6130168 32 5.779520 紡織
## 8 2021-05-08 4.2417601 39 9.355849 紡織
## 9 2021-05-13 2.7161606 276 2.569889 紡織
## 10 2021-05-18 3.0668011 1350 -6.164871 紡織
## 11 2021-05-23 2.7175041 2631 -5.334274 紡織
## 12 2021-05-28 2.2418431 2704 -4.151119 紡織
## 13 2021-06-02 NA 1229 NA 紡織
##
## [[5]]
## group rmse cases sign category
## 1 2021-04-03 1.753062 18 3.913094 電機
## 2 2021-04-08 3.070448 8 6.732890 電機
## 3 2021-04-13 2.980063 12 6.604598 電機
## 4 2021-04-18 3.314377 10 7.407671 電機
## 5 2021-04-23 2.964677 22 6.563831 電機
## 6 2021-04-28 4.081344 28 9.098614 電機
## 7 2021-05-03 2.985621 32 6.168047 電機
## 8 2021-05-08 1.564066 39 3.448289 電機
## 9 2021-05-13 2.887599 276 -5.854297 電機
## 10 2021-05-18 5.340982 1350 -11.737978 電機
## 11 2021-05-23 6.566642 2631 -14.355650 電機
## 12 2021-05-28 6.126815 2704 -13.379775 電機
## 13 2021-06-02 NA 1229 NA 電機
##
## [[6]]
## group rmse cases sign category
## 1 2021-04-03 1.6174211 18 -3.615504 電纜
## 2 2021-04-08 1.2578495 8 -2.780159 電纜
## 3 2021-04-13 0.7426281 12 -1.577766 電纜
## 4 2021-04-18 0.9465281 10 1.965829 電纜
## 5 2021-04-23 1.5835264 22 3.443582 電纜
## 6 2021-04-28 4.5464617 28 9.877612 電纜
## 7 2021-05-03 4.1897434 32 9.238563 電纜
## 8 2021-05-08 5.1066677 39 11.388424 電纜
## 9 2021-05-13 2.1238049 276 1.698866 電纜
## 10 2021-05-18 3.0290207 1350 -6.426424 電纜
## 11 2021-05-23 4.9981911 2631 -10.706199 電纜
## 12 2021-05-28 4.3920201 2704 -9.085231 電纜
## 13 2021-06-02 NA 1229 NA 電纜
因台灣類股分類數量太多,不易呈現。我們將32類股分為:工業相關、電子相關與服務相關。
# 類股分類
unique(TAIEX_CAT_plot$category)## [1] "水泥" "食品" "塑膠" "紡織" "電機" "電纜" "玻璃" "造紙"
## [9] "鋼鐵" "橡膠" "汽車" "營建" "航運" "觀光" "金融" "百貨"
## [17] "其他" "化學" "生技" "油電" "半導體" "電腦" "光電" "通信"
## [25] "零組件" "電通" "資服" "其他電" "化生" "電子" "窯製" "化工"
# 工業相關
industrial = c("塑膠", "玻璃", "水泥", "造紙", "鋼鐵","橡膠", "窯製", "油電","窯製","化工","化學", "化生", "生技")
# 電子相關
electronics = c("半導體" ,"電腦","光電" , "電通" , "零組件" ,"電子", "電機", "電纜","其他電")
# 服務相關
service = c("食品", "紡織","營建", "航運" , "觀光" , "金融" , "百貨", "通信" , "資服","其他" )p <- ggplot() +
geom_bar(data = TW_RMSE, aes(x = group, y = cases*0.3/100), stat = "identity") +
geom_point(data = TAIEX_CAT_plot%>%filter(category %in%industrial), aes(x = group, y = rmse, group = category, color = category)) +
geom_line(data = TAIEX_CAT_plot%>%filter(category %in%industrial), aes(x = group, y = rmse, group = category, color = category)) +
geom_line(linetype="dashed",data = TAIEX_CAT_plot%>%filter(category %in%industrial), aes(x = group, y = sign, group = category, color = category)) +
# 副座標軸:Cases (調整刻度)
# scale_y_continuous(sec.axis = sec_axis(~. *100/0.3, name = "Cases")) +
labs(title = "TAIEX_工業相關類股分類_RMSE(5 DAYS)", x ="日期", y = "五天內累積確診人數 (*100/0.3)") +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\TAIEX_工業相關類股分類_RMSE(5 DAYS).html")p <- ggplot() +
geom_bar(data = TW_RMSE, aes(x = group, y = cases*0.3/100), stat = "identity") +
geom_point(data = TAIEX_CAT_plot%>%filter(category %in%electronics), aes(x = group, y = rmse, group = category, color = category)) +
geom_line(data = TAIEX_CAT_plot%>%filter(category %in%electronics), aes(x = group, y = rmse, group = category, color = category)) +
geom_line(linetype="dashed",data = TAIEX_CAT_plot%>%filter(category %in%electronics), aes(x = group, y = sign, group = category, color = category)) +
# 副座標軸:Cases (調整刻度)
scale_y_continuous(sec.axis = sec_axis(~. *100/0.3, name = "Cases")) +
labs(title = "TAIEX_電子相關類股分類_RMSE(5 DAYS)", x ="日期", y = "五天內累積確診人數 (*100/0.3)") +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\TAIEX_電子相關類股分類_RMSE(5 DAYS).html")p <- ggplot() +
geom_bar(data = TW_RMSE, aes(x = group, y = cases*0.3/100), stat = "identity") +
geom_point(data = TAIEX_CAT_plot%>%filter(category %in%service), aes(x = group, y = rmse, group = category, color = category)) +
geom_line(data = TAIEX_CAT_plot%>%filter(category %in%service), aes(x = group, y = rmse, group = category, color = category)) +
geom_line(linetype="dashed",data = TAIEX_CAT_plot%>%filter(category %in%service), aes(x = group, y = sign, group = category, color = category)) +
# 副座標軸:Cases (調整刻度)
scale_y_continuous(sec.axis = sec_axis(~. *100/0.3, name = "Cases")) +
labs(title = "TAIEX_服務相關類股分類_RMSE(5 DAYS)", x ="日期", y = "五天內累積確診人數 (*100/0.3)") +
theme_bw()
ggplotly(p)# saveWidget(ggplotly(p), file="plot\\stock\\TAIEX_服務相關類股分類_RMSE(5 DAYS).html")將台股類股指數變動率乘上100使數據直觀
for (i in 1:32){
df = TAIEX_CAT_list[[i]]
name = colnames(df)[2]
colnames(df)[3:6] <- c("std","ROC", "7MA", "7EMA")
df = full_join(df, data_list[[1]], by = "Date")
p <- ggplot(df, aes(TW_COVID_ROC , ROC*100, color=Date)) +
geom_point() +
geom_smooth(method = "lm")+
ylim(-2.5,2.5)+
labs(title = paste0(name,"類股_變動率對單日確診人數變動率作圖"), x ="單日確診人數變動率", y = "類股指數變動率") +
theme_bw()
ggplotly(p)
# saveWidget(ggplotly(p), file=paste0("plot\\stock\\",name,"類股_變動率對單日確診人數作圖.html"))
}以觀光類股為例
以光電類股為例
## 台股
# 大盤
TAIEX_COVID_df <- as_tibble(readr::read_csv("../rlads_Final_Project/data/cvaw/TAIEX-COVID.csv")) %>%
mutate(Date = as.Date(X1)) %>%
select(-X1) %>%
filter(between(Date, as.Date("2021-04-01"), as.Date("2021-05-31")))
# 類股
TAIEX_df <- as_tibble(readr::read_csv("../rlads_Final_Project/data/cvaw/TAIEX.csv")) %>%
select(-X1) %>%
mutate(Date = as.Date(Date)) %>%
filter(between(Date, as.Date("2021-04-01"), as.Date("2021-05-31")))
# 類股變動率
TAIEX_ROC_df <- as_tibble(readr::read_csv("../rlads_Final_Project/data/cvaw/TAIEX_ROC.csv")) %>%
select(-X1) %>%
mutate(Date = as.Date(Date)) %>%
filter(between(Date, as.Date("2021-04-01"), as.Date("2021-05-31")))
plt_df_cvaw_covid <- as_tibble(readr::read_csv("../rlads_Final_Project/data/cvaw/cvaw_covid.csv")) %>%
mutate(Date = as.Date(Date)) %>%
filter(between(Date, as.Date("2021-04-01"), as.Date("2021-05-31"))) %>%
mutate(score_sum = round(Valence_Sum * (Arousal_Sum + words_Num * 5), 6),
score_avg = if_else(score_sum == 0, 0, round(score_sum / ((words_Num)** 2), 6)))
# stock板
plt_df_cvaw_stock <- as_tibble(readr::read_csv("../rlads_Final_Project/data/cvaw/cvaw_stock.csv")) %>%
mutate(Date = as.Date(Date)) %>%
filter(between(Date, as.Date("2021-04-01"), as.Date("2021-05-31"))) %>%
mutate(score_sum = round(Valence_Sum * (Arousal_Sum + words_Num * 5), 6),
score_avg = if_else(score_sum == 0, 0, round(score_sum / ((words_Num)** 2), 6)))## 做每日平均分數
# covid板
group_plt_df_cvaw_covid <- plt_df_cvaw_covid %>%
group_by(Date) %>%
summarise(score_sum_date = mean(score_sum), score_avg_date = mean(score_avg)) %>%
ungroup()
# stock板
group_plt_df_cvaw_stock <- plt_df_cvaw_stock %>%
group_by(Date) %>%
summarise(score_sum_date = mean(score_sum), score_avg_date = mean(score_avg)) %>%
ungroup()temp_p = ggplot(data = plt_df_cvaw_covid)+
geom_point(aes(x = Valence_Avg, y = Arousal_Avg, color = Date), size = 0.1)+
theme_bw()
ggplotly(temp_p)temp_p = ggplot(data = plt_df_cvaw_covid)+
geom_point(aes(x = Valence_Sum, y = Arousal_Sum, color = Date), size = 0.1)+
theme_bw()
ggplotly(temp_p)#
p1<-ggplot()+
geom_hline(yintercept = 0)+
geom_line(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date), color = "red")+
geom_line(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date), color = "blue")+
labs(title = "COVID版與股版日期對平均情緒分數折線圖" , x = "日期", y= "平均情緒分數")+
theme_bw()
ggplotly(p1)#
p2<-ggplot()+
geom_hline(yintercept = 0)+
geom_vline(xintercept = 0)+
geom_point(aes(x=group_plt_df_cvaw_covid$score_avg_date, y=group_plt_df_cvaw_stock$score_avg_date))+
geom_smooth(aes(x=group_plt_df_cvaw_covid$score_avg_date, y=group_plt_df_cvaw_stock$score_avg_date), method="lm")+
labs(title = "COVID版對股版平均情緒分數散佈圖" , x = "COVID版平均情緒分數", y= "股版平均情緒分數")+
theme_bw()
ggplotly(p2)# covid-19板對新增人數變動率
temp_p = ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_covid, aes(x = Date , y = score_avg), color = ifelse(plt_df_cvaw_covid$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5),
color = ifelse(TAIEX_COVID_df$日新增案例變動率 >= 0, "red", "darkgreen"),
size = abs(TAIEX_COVID_df$日新增案例變動率)*3) +
theme_bw() +
labs(title = "Covid-19版情緒分析-對新增病例變動率")
ggplotly(temp_p)# stock板對新增人數變動率
temp_p =ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_stock, aes(x = Date, y = score_avg), color = ifelse(plt_df_cvaw_stock$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*10), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*10),
color = ifelse(TAIEX_COVID_df$日新增案例變動率 >= 0, "red", "darkgreen"),
size = abs(TAIEX_COVID_df$日新增案例變動率) * 3) +
theme_bw() +
labs(title = "股版情緒分析-對新增病例變動率")
ggplotly(temp_p)# covid-19板對大盤變動率
temp_p = ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_covid, aes(x = Date , y = score_avg), color = ifelse(plt_df_cvaw_covid$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5),
color = ifelse(TAIEX_COVID_df$大盤變動率 >= 0, "red", "darkgreen"),
size = abs(TAIEX_COVID_df$大盤變動率)*100) +
theme_bw() +
labs(title = "Covid-19版情緒分析-對大盤變動率")
ggplotly(temp_p)# stock板對新增人數變動率
temp_p =ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_stock, aes(x = Date, y = score_avg), color = ifelse(plt_df_cvaw_stock$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*10), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*10),
color = ifelse(TAIEX_COVID_df$大盤變動率 >= 0, "red", "darkgreen"),
size = abs(TAIEX_COVID_df$大盤變動率) * 100) +
theme_bw() +
labs(title = "股版情緒分析-對大盤變動率")
ggplotly(temp_p)###觀光類股
# coivd-19版對類股
temp_p= ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_covid, aes(x = Date , y = score_avg), color = ifelse(plt_df_cvaw_covid$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5),
color = ifelse(TAIEX_ROC_df$觀光 >= 0, "red", "darkgreen"),
size = abs(TAIEX_ROC_df$觀光) * 100) +
theme_bw() +
labs(title = "Covid-19版情緒分析-對觀光類股變動率")
ggplotly(temp_p)# stock板對類股
temp_p=ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_stock, aes(x = Date, y = score_avg), color = ifelse(plt_df_cvaw_stock$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*5), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*5),
color = ifelse(TAIEX_ROC_df$觀光 >= 0, "red", "darkgreen"),
size = abs(TAIEX_ROC_df$觀光) * 100) +
theme_bw() +
labs(title = "股版情緒分析-對觀光類股變動率")
ggplotly(temp_p)###光電類股
# coivd-19版對類股
temp_p= ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_covid, aes(x = Date , y = score_avg), color = ifelse(plt_df_cvaw_covid$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_covid, aes(x = Date, y = score_avg_date*5),
color = ifelse(TAIEX_ROC_df$光電 >= 0, "red", "darkgreen"),
size = abs(TAIEX_ROC_df$光電) * 100) +
theme_bw() +
labs(title = "Covid-19版情緒分析-對光電類股變動率")
ggplotly(temp_p)# stock板對類股
temp_p=ggplot()+
geom_hline(yintercept = 0) +
geom_point(data = plt_df_cvaw_stock, aes(x = Date, y = score_avg), color = ifelse(plt_df_cvaw_stock$score_avg >= 0, "pink", "lightgreen"), size = 1)+
geom_line(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*5), color = "darkgray") +
geom_point(data = group_plt_df_cvaw_stock, aes(x = Date, y = score_avg_date*5),
color = ifelse(TAIEX_ROC_df$光電 >= 0, "red", "darkgreen"),
size = abs(TAIEX_ROC_df$光電) * 100) +
theme_bw() +
labs(title = "股版情緒分析-對光電類股變動率")
ggplotly(temp_p)感謝閱讀。